{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 手写字母XO数据预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 安装依赖包"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```bash\n",
    "sudo pip3 install mlxtend\n",
    "sudo pip3 install keras\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 载入数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "from mlxtend.data import loadlocal_mnist\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "# Path to the binary file holding the label data\n",
    "LABEL_FILE = \"emnist-letters-train-labels-idx1-ubyte\"\n",
    "# Path to the binary file holding the image data\n",
    "IMAGE_FILE = \"emnist-letters-train-images-idx3-ubyte\"\n",
    "# Load the training images together with their labels\n",
    "X, y = loadlocal_mnist(\n",
    "    images_path=IMAGE_FILE,\n",
    "    labels_path=LABEL_FILE,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 显示样例数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "z\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAP8AAAD8CAYAAAC4nHJkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAADohJREFUeJzt3X+IVfeZx/HP468gtmhcXZHUqFOTBaOsXQZZ3Ji4NDFuCJrmj1BJFjdIp39U2EIDG9zA+k9AlrRNSaBkpFoj3aihDREiu82aDdNAU2J+rEazbWbDmCo6E+OPWklwR5/9Y45lYuZ8z829595zJs/7BTL3nud+5z5c/My5937POV9zdwGIZ0LVDQCoBuEHgiL8QFCEHwiK8ANBEX4gKMIPBEX4gaAIPxDUpE4+mZlxOCHQZu5ujTyupT2/ma0xs9+aWb+ZPdLK7wLQWdbssf1mNlHS7yTdKem4pNclrXf3o4kx7PmBNuvEnn+5pH53f9/dL0naLWldC78PQAe1Ev4bJP1+1P3j2bZPMbMeMztoZgdbeC4AJWv7F37u3iupV+JtP1Anrez5T0iaN+r+V7JtAMaBVsL/uqSbzGyhmU2R9E1J+8ppC0C7Nf22392HzWyTpP+QNFHSdnc/UlpnKMWkSa19shseHi6pE9RN01N9TT0Zn/k7jvDH05GDfACMX4QfCIrwA0ERfiAowg8ERfiBoDp6Pj/aY9asWbm19evXJ8fOmDEjWd+/f3+yfv78+WS9SufOncutnT59uoOd1BN7fiAowg8ERfiBoAg/EBThB4Ii/EBQnNVXA2bpk7DWrl2brD/00EO5tbvuuis59rrrrkvWL1++nKxfuXKlpfEpEyak901Fr9uxY8dyaytWrEiOHc9TgZzVByCJ8ANBEX4gKMIPBEX4gaAIPxAU4QeCYp6/BlKn5ErSq6++mqzPnz8/t1Y0V150Su7Zs2eT9QsXLiTrr7zySm7t4sWLybFLlixJ1leuXJmsT58+Pbd2yy23JMf29/cn63XGPD+AJMIPBEX4gaAIPxAU4QeCIvxAUIQfCKqlS3eb2YCkC5IuSxp29+4ymopm2rRpyfpbb72VrO/Zsye3VnQ+/eHDh1uqF63iOzg42PTYouMfNm/enKxv3LgxWY+ujOv2/627j98rHwBB8bYfCKrV8LukX5rZG2bWU0ZDADqj1bf9t7r7CTP7c0kvmdn/uHvf6AdkfxT4wwDUTEt7fnc/kf0ckvS8pOVjPKbX3bv5MhCol6bDb2bTzOzLV29LWi3pnbIaA9BerbztnyPp+ezyyZMk/Zu7/3spXQFou6bD7+7vS/rLEnsJK3V9eUl64IEHkvWia+ePV7Nnz07Wb7vttmQ9dYxD0TEGETDVBwRF+IGgCD8QFOEHgiL8QFCEHwiqjLP60GZf1Km8SZPS//1Wr16drHd1dSXrqctvnzp1Kjk2Avb8QFCEHwiK8ANBEX4gKMIPBEX4gaAIPxAU8/yozIIFC5L1oiW4i5YHf+KJJ3Jrn3zySXJsBOz5gaAIPxAU4QeCIvxAUIQfCIrwA0ERfiAo5vnRVlOnTs2tPffcc8mxixYtStYff/zxZH3v3r3JenTs+YGgCD8QFOEHgiL8QFCEHwiK8ANBEX4gqMJ5fjPbLukeSUPuviTbNlPSHkkLJA1Iut/dz7avTdSVmSXrN998c26taB7/5ZdfTtZ37dqVrH/88cfJenSN7Pl/KmnNNdsekXTA3W+SdCC7D2AcKQy/u/dJOnPN5nWSdma3d0q6t+S+ALRZs5/557j7yez2KUlzSuoHQIe0fGy/u7uZeV7dzHok9bT6PADK1eyef9DM5kpS9nMo74Hu3uvu3e7e3eRzAWiDZsO/T9KG7PYGSS+U0w6ATikMv5k9K+nXkv7CzI6b2UZJWyXdaWbvSbojuw9gHCn8zO/u63NKXy+5F9RQ0Tx+V1dXsv7www83
/dx9fX3J+gcffND07wZH+AFhEX4gKMIPBEX4gaAIPxAU4QeC4tLdSFq7dm2yXnT57Pnz5+fWenrSR33v2bMnWR8eHk7WkcaeHwiK8ANBEX4gKMIPBEX4gaAIPxAU4QeCYp4/uEmT0v8FHnzwwWQ9NY8vSceOHcut7d+/PzmWS2+3F3t+ICjCDwRF+IGgCD8QFOEHgiL8QFCEHwiKef4vuIkTJybrt99+e0v1M2euXcP10x577LHc2ocffpgci/Zizw8ERfiBoAg/EBThB4Ii/EBQhB8IivADQRXO85vZdkn3SBpy9yXZti2SviXp6kTtZndPn5yNtkkto71w4cLk2E2bNiXr06ZNS9b37t2brKfO2Xf35Fi0VyN7/p9KWjPG9h+6+7LsH8EHxpnC8Lt7n6T0YVwAxp1WPvNvMrNDZrbdzK4vrSMAHdFs+H8s6auSlkk6Ken7eQ80sx4zO2hmB5t8LgBt0FT43X3Q3S+7+xVJ2yQtTzy219273b272SYBlK+p8JvZ3FF3vyHpnXLaAdApjUz1PStplaRZZnZc0r9IWmVmyyS5pAFJ325jjwDawDo512pmTOw2oeic/KVLl+bWduzYkRy7ePHiZL2npydZL5rn59r7nefu+Qd+jMIRfkBQhB8IivADQRF+ICjCDwRF+IGguHT3OLBq1apkPXVa7qJFi5Jj+/r6kvUXX3wxWWcqb/xizw8ERfiBoAg/EBThB4Ii/EBQhB8IivADQTHPXwNTp05N1osur33HHXfk1vr7+5Njn3rqqWT99OnTyTrGL/b8QFCEHwiK8ANBEX4gKMIPBEX4gaAIPxAU8/wlSC2RLUldXV3J+pYtW5L1FStWJOu7d+/OrT366KPJsYODg8k6vrjY8wNBEX4gKMIPBEX4gaAIPxAU4QeCIvxAUIXz/GY2T9IzkuZIckm97v4jM5spaY+kBZIGJN3v7mfb12p9zZ49O1kvmmu/7777kvVt27Yl61u3bs2tDQ0NJccirkb2/MOSvufuiyX9taTvmNliSY9IOuDuN0k6kN0HME4Uht/dT7r7m9ntC5LelXSDpHWSdmYP2ynp3nY1CaB8n+szv5ktkPQ1Sb+RNMfdT2alUxr5WABgnGj42H4z+5Kkn0v6rrv/YfTx7O7uZuY543ok9bTaKIByNbTnN7PJGgn+z9z9F9nmQTObm9XnShrzmyV373X3bnfvLqNhAOUoDL+N7OJ/Iuldd//BqNI+SRuy2xskvVB+ewDapZG3/X8j6e8lHTazt7NtmyVtlbTXzDZKOibp/va0WH8rV65M1tesWZOsX7x4MVk/cOBAsp6aznMf89MYUBx+d39VUt4J618vtx0AncIRfkBQhB8IivADQRF+ICjCDwRF+IGgrJPzwHmHAI8Hs2bNyq299tprybE33nhjsv70008n608++WSyfuXKlWS9FRMnTkzWFy9enKwvXbo0t1Z0yfNWpV6XXbt2JccODAyU3E3nuHtDLyx7fiAowg8ERfiBoAg/EBThB4Ii/EBQhB8Iinn+Bi1atCi3duTIkeTYyZMnJ+tnzpxJ1j/66KNkvZ2K5uKnT5+erM+YMaPp392q1Dz/pk2bkmN37NiRrF++fLmpnjqBeX4ASYQfCIrwA0ERfiAowg8ERfiBoAg/EFTDy3VFNzw8nFu7dOlScuyUKVOS9ZkzZybrqblyqb3n8589m151/fz588n6uXPnymznc0nNxRddg6HO8/hlYc8PBEX4gaAIPxAU4QeCIvxAUIQfCIrwA0EVns9vZvMkPSNpjiSX1OvuPzKzLZK+JenD7KGb3X1/we8at+fzp849v+eee5Jjly1blqxPmJD+G3zo0KFk/ejRo7m1VuerW53H7+T1Ij6P1HEb412j5/M3cpDPsKTvufubZvZlSW+Y2UtZ7Yfu/nizTQKoTmH43f2kpJPZ7Qtm9q6kG9rdGID2+lyf+c1sgaSvSfpNtmmTmR0ys+1mdn3OmB4zO2hmB1vqFECp
Gg6/mX1J0s8lfdfd/yDpx5K+KmmZRt4ZfH+sce7e6+7d7t5dQr8AStJQ+M1sskaC/zN3/4Ukufugu1929yuStkla3r42AZStMPw28jX3TyS96+4/GLV97qiHfUPSO+W3B6BdGpnqu1XSryQdlnT13NHNktZr5C2/SxqQ9O3sy8HU76rnvE+LiqbqiupFik7ZbecpvRh/Gp3q47r9JSD8qBOu2w8gifADQRF+ICjCDwRF+IGgCD8QFFN9wBcMU30Akgg/EBThB4Ii/EBQhB8IivADQRF+IKhOL9F9WtKxUfdnZdvqqK691bUvid6aVWZv8xt9YEcP8vnMk5sdrOu1/eraW137kuitWVX1xtt+ICjCDwRVdfh7K37+lLr2Vte+JHprViW9VfqZH0B1qt7zA6hIJeE3szVm9lsz6zezR6roIY+ZDZjZYTN7u+olxrJl0IbM7J1R22aa2Utm9l72c8xl0irqbYuZncheu7fN7O6KeptnZv9lZkfN7IiZ/WO2vdLXLtFXJa9bx9/2m9lESb+TdKek45Jel7Te3fPXme4gMxuQ1O3ulc8Jm9ltkv4o6Rl3X5Jt+1dJZ9x9a/aH83p3/6ea9LZF0h+rXrk5W1Bm7uiVpSXdK+kfVOFrl+jrflXwulWx518uqd/d33f3S5J2S1pXQR+15+59ks5cs3mdpJ3Z7Z0a+c/TcTm91YK7n3T3N7PbFyRdXVm60tcu0Vclqgj/DZJ+P+r+cdVryW+X9Esze8PMeqpuZgxzRq2MdErSnCqbGUPhys2ddM3K0rV57ZpZ8bpsfOH3Wbe6+19J+jtJ38ne3taSj3xmq9N0TUMrN3fKGCtL/0mVr12zK16XrYrwn5A0b9T9r2TbasHdT2Q/hyQ9r/qtPjx4dZHU7OdQxf38SZ1Wbh5rZWnV4LWr04rXVYT/dUk3mdlCM5si6ZuS9lXQx2eY2bTsixiZ2TRJq1W/1Yf3SdqQ3d4g6YUKe/mUuqzcnLeytCp+7Wq34rW7d/yfpLs18o3//0r65yp6yOmrS9J/Z/+OVN2bpGc18jbw/zTy3chGSX8m6YCk9yT9p6SZNeptl0ZWcz6kkaDNrai3WzXylv6QpLezf3dX/dol+qrkdeMIPyAovvADgiL8QFCEHwiK8ANBEX4gKMIPBEX4gaAIPxDU/wPaEdR01wm5JwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "def array2img(x):\n",
    "    '''Turn a flat (784,) pixel vector into a 28*28 image array.\n",
    "\n",
    "    The vector is reshaped to 28 rows by 28 columns and then transposed,\n",
    "    so the result is the transpose of the plain row-major reshape.\n",
    "    '''\n",
    "    return x.reshape((28, 28)).T\n",
    "\n",
    "def num2letter(num):\n",
    "    '''Map a numeric label to a lowercase letter (1 gives 'a', 2 gives 'b', ...).'''\n",
    "    code_point = ord('a') + num - 1\n",
    "    return chr(code_point)\n",
    "\n",
    "# Index of the sample to preview; defined once so that the image and the\n",
    "# printed label always refer to the same sample.\n",
    "sample_index = 14\n",
    "# Show the image\n",
    "plt.imshow(array2img(X[sample_index]), cmap='gray')\n",
    "# Print the letter that the sample's label stands for\n",
    "print(num2letter(y[sample_index]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 筛选数据\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "去除训练集中的无关字符，只保留`X`跟`O`对应的图像和标签，并将标签替换为0（`O`）和1（`X`）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "def letter2label(letter):\n",
    "    '''Convert a single letter to its numeric label ('a' gives 1, ..., 'z' gives 26).\n",
    "\n",
    "    The letter is lowercased first, so 'X' and 'x' both map to 24.\n",
    "    '''\n",
    "    return ord(letter.lower()) - ord('a') + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find the positions of the samples whose label is the letter O or X\n",
    "xo_indexs = np.where((y == letter2label('o')) | (y == letter2label('x')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([     3,      9,     17, ..., 124742, 124748, 124798]),)"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the selected positions (the np.where result computed above)\n",
    "xo_indexs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the samples selected by xo_indexs\n",
    "X_train, y_train = X[xo_indexs], y[xo_indexs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert labels to 0/1: 1 for the letter X, 0 for the letter O.\n",
    "# The raw labels y[xo_indexs] are read instead of y_train, so re-running this\n",
    "# cell gives the same result rather than comparing 0/1 values with the label of 'x'.\n",
    "y_train = np.uint8(y[xo_indexs] == letter2label('x'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 测试集执行相同操作\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_image_file = 'emnist-letters-test-images-idx3-ubyte'\n",
    "test_label_file = 'emnist-letters-test-labels-idx1-ubyte'\n",
    "# Load the test images together with their labels\n",
    "X_test, y_test = loadlocal_mnist(images_path=test_image_file, labels_path=test_label_file)\n",
    "# A separate name is used for the test-set positions so that xo_indexs keeps\n",
    "# pointing at the training-set positions computed earlier.\n",
    "xo_test_indexs = np.where(np.logical_or(y_test==letter2label('o'), y_test==letter2label('x')))\n",
    "# Keep only the O/X samples\n",
    "X_test = X_test[xo_test_indexs]\n",
    "y_test = y_test[xo_test_indexs]\n",
    "# Convert labels to 0/1: 1 for the letter X, 0 for the letter O\n",
    "y_test = np.uint8(y_test == letter2label('x'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 序列化保存数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bundle the train/test arrays into a single dictionary\n",
    "dataset = dict(\n",
    "    X_train=X_train,\n",
    "    y_train=y_train,\n",
    "    X_test=X_test,\n",
    "    y_test=y_test,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Serialize the dataset dictionary to a file with pickle.\n",
    "# NOTE(review): the file is opened in binary mode and receives pickle data,\n",
    "# so despite the .txt extension it is not a text file.\n",
    "with open('xo_dataset.txt', 'wb') as f:\n",
    "    pickle.dump(dataset, f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
